filter_features <- function(df_train_sm){
  df_fs <- df_train_sm %>%
   select(-engagement) %>%
   select(-content_crashes) %>%
   select(-client_id) %>%
   select(-label_beta) %>%
   select(-label_release) %>%
   select(-is_release) %>%
   select(-app_version)
  return(df_fs)
}

tl;dr

This analysis is focused on utilizing Boruta as a initial pre-filter to the covariates, to narrow the feature selection search space.

Method

Apply Boruta to each performance covariate.

engagement <- c('active_hours','active_hours_max','uri_count','uri_count_max','search_count','search_count_max','num_pages','num_pages_max','daily_max_tabs','daily_max_tabs_max','daily_unique_domains','daily_unique_domains_max','daily_tabs_opened','daily_tabs_opened_max')

df_train_sm_1 <- df_train_encoder %>%
   sample_n(1000)
df_train_sm_2 <- df_train_encoder %>%
   sample_n(1000)
df_train_sm_3 <- df_train_encoder %>%
   sample_n(1000)
#df_train_sm_4 <- df_train_encoder %>% sample_n(1000)
#df_train_sm_5 <- df_train_encoder %>% sample_n(1000)

df_fs_1 <- filter_features(df_train_sm_1)
df_fs_2 <- filter_features(df_train_sm_2)
df_fs_3 <- filter_features(df_train_sm_3)
#df_fs_4 <- filter_features(df_train_sm_4)
#df_fs_5 <- filter_features(df_train_sm_5)

Boruta is a feature selection algorithm based on the random forest algorithm. In the process of deciding if a feature is important or not, some features may be marked as Tentative. Sometimes increasing the maxRuns can help resolve the Tentativeness of the feature.

boruta_results_1 <- list()
boruta_results_2 <- list()
boruta_results_3 <- list()
#boruta_results_4 <- list()
#boruta_results_5 <- list()

for (metric in engagement){
  print(paste('Applying Boruta to ', metric))
  boruta.out <- Boruta(y = df_train_sm_1[[metric]], x=df_fs_1, doTrace=0)
  boruta_results_1[[metric]]  <- boruta.out
  
  boruta.out <- Boruta(y = df_train_sm_2[[metric]], x=df_fs_2, doTrace=0)
  boruta_results_2[[metric]]  <- boruta.out
  
  boruta.out <- Boruta(y = df_train_sm_3[[metric]], x=df_fs_3, doTrace=0)
  boruta_results_3[[metric]]  <- boruta.out
  #boruta.out <- Boruta(y = df_train_sm_4[[metric]], x=df_fs_4, doTrace=0)
  #boruta_results_4[[metric]]  <- boruta.out
  #boruta.out <- Boruta(y = df_train_sm_5[[metric]], x=df_fs_5, doTrace=0)
  #boruta_results_5[[metric]]  <- boruta.out
}
## [1] "Applying Boruta to  active_hours"
## [1] "Applying Boruta to  active_hours_max"
## [1] "Applying Boruta to  uri_count"
## [1] "Applying Boruta to  uri_count_max"
## [1] "Applying Boruta to  search_count"
## [1] "Applying Boruta to  search_count_max"
## [1] "Applying Boruta to  num_pages"
## [1] "Applying Boruta to  num_pages_max"
## [1] "Applying Boruta to  daily_max_tabs"
## [1] "Applying Boruta to  daily_max_tabs_max"
## [1] "Applying Boruta to  daily_unique_domains"
## [1] "Applying Boruta to  daily_unique_domains_max"
## [1] "Applying Boruta to  daily_tabs_opened"
## [1] "Applying Boruta to  daily_tabs_opened_max"
for (metric in engagement){
  plot(boruta_results_1[[metric]], cex.axis=.7, las=2, xlab="", main=metric)
  plot(boruta_results_2[[metric]], cex.axis=.7, las=2, xlab="", main=metric)
  plot(boruta_results_3[[metric]], cex.axis=.7, las=2, xlab="", main=metric)
}

Find the top 5 ranking features per metric, and add to a list.

features_top5_1 <- NULL
features_top5_2 <- NULL
features_top5_3 <- NULL

for(metric in engagement){
  features_top5_1 <- c(names(sort(apply(boruta_results_1[[metric]]$ImpHistory, 2, median), decreasing = TRUE)[1:5]), features_top5_1)
  features_top5_2 <- c(names(sort(apply(boruta_results_2[[metric]]$ImpHistory, 2, median), decreasing = TRUE)[1:5]), features_top5_2)
  features_top5_3 <- c(names(sort(apply(boruta_results_3[[metric]]$ImpHistory, 2, median), decreasing = TRUE)[1:5]), features_top5_3)
}

top5_1 <- sort(c(unique(features_top5_1)))
top5_2 <- sort(c(unique(features_top5_2)))
top5_3 <- sort(c(unique(features_top5_3)))

n <- max(length(top5_1), length(top5_2), length(top5_3))
length(top5_1) <- n 
length(top5_2) <- n 
length(top5_3) <- n 

x <- data.frame(top5_1, top5_2, top5_3)
kable(x) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F) %>%
scroll_box(width = "100%")
top5_1 top5_2 top5_3
daily_num_sessions_started cpu_cores daily_num_sessions_started
daily_num_sessions_started_max daily_num_sessions_started daily_num_sessions_started_max
FX_PAGE_LOAD_MS_2_PARENT daily_num_sessions_started_max FX_PAGE_LOAD_MS_2_PARENT
num_active_days FX_PAGE_LOAD_MS_2_PARENT memory_mb
num_bookmarks memory_cat num_active_days
profile_age memory_mb num_bookmarks
profile_age_cat num_active_days profile_age
session_length num_bookmarks profile_age_cat
session_length_max profile_age session_length
TIME_TO_DOM_COMPLETE_MS profile_age_cat session_length_max
TIME_TO_DOM_CONTENT_LOADED_END_MS session_length startup_ms_max
TIME_TO_DOM_INTERACTIVE_MS session_length_max TIME_TO_DOM_COMPLETE_MS
TIME_TO_LOAD_EVENT_END_MS TIME_TO_DOM_CONTENT_LOADED_END_MS TIME_TO_DOM_INTERACTIVE_MS
TIME_TO_NON_BLANK_PAINT_MS TIME_TO_DOM_INTERACTIVE_MS TIME_TO_LOAD_EVENT_END_MS
timezone_cat_(4,6] TIME_TO_LOAD_EVENT_END_MS TIME_TO_NON_BLANK_PAINT_MS
NA TIME_TO_NON_BLANK_PAINT_MS NA
NA timezone_cat_(6,8] NA

Increasing to 10:

features_top10_1 <- NULL
features_top10_2 <- NULL
features_top10_3 <- NULL

for(metric in engagement){
  features_top10_1 <- c(names(sort(apply(boruta_results_1[[metric]]$ImpHistory, 2, median), decreasing = TRUE)[1:10]), features_top10_1)
  features_top10_2 <- c(names(sort(apply(boruta_results_2[[metric]]$ImpHistory, 2, median), decreasing = TRUE)[1:10]), features_top10_2)
  features_top10_3 <- c(names(sort(apply(boruta_results_3[[metric]]$ImpHistory, 2, median), decreasing = TRUE)[1:10]), features_top10_3)
}

top10_1 <- sort(c(unique(features_top10_1)))
top10_2 <- sort(c(unique(features_top10_2)))
top10_3 <- sort(c(unique(features_top10_3)))

n <- max(length(top10_1), length(top10_2), length(top10_3))
length(top10_1) <- n 
length(top10_2) <- n 
length(top10_3) <- n 

x <- data.frame(top10_1, top10_2, top10_3)
kable(x) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F) %>%
scroll_box(width = "100%")
top10_1 top10_2 top10_3
daily_num_sessions_started cpu_cores daily_num_sessions_started
daily_num_sessions_started_max cpu_speed_mhz daily_num_sessions_started_max
FX_PAGE_LOAD_MS_2_PARENT daily_num_sessions_started FX_PAGE_LOAD_MS_2_PARENT
memory_mb daily_num_sessions_started_max memory_cat
num_active_days FX_PAGE_LOAD_MS_2_PARENT memory_mb
num_bookmarks memory_cat num_active_days
profile_age memory_mb num_bookmarks
profile_age_cat num_active_days profile_age
session_length num_addons profile_age_cat
session_length_max num_bookmarks session_length
startup_ms profile_age session_length_max
startup_ms_max profile_age_cat startup_ms
TIME_TO_DOM_COMPLETE_MS session_length startup_ms_max
TIME_TO_DOM_CONTENT_LOADED_END_MS session_length_max TIME_TO_DOM_COMPLETE_MS
TIME_TO_DOM_INTERACTIVE_MS startup_ms TIME_TO_DOM_CONTENT_LOADED_END_MS
TIME_TO_LOAD_EVENT_END_MS startup_ms_max TIME_TO_DOM_INTERACTIVE_MS
TIME_TO_NON_BLANK_PAINT_MS TIME_TO_DOM_COMPLETE_MS TIME_TO_LOAD_EVENT_END_MS
timezone_cat_(4,6] TIME_TO_DOM_CONTENT_LOADED_END_MS TIME_TO_NON_BLANK_PAINT_MS
NA TIME_TO_DOM_INTERACTIVE_MS timezone_offset
NA TIME_TO_LOAD_EVENT_END_MS NA
NA TIME_TO_NON_BLANK_PAINT_MS NA
NA timezone_cat_(6,8] NA

Equal Labels

As we can see, using different samples hardly changes the result. Therefore, we can use only one df with 1000 samples. Equalize by label, then perform the above.

df_beta <- df_train_encoder %>% 
  filter(label_beta == 1)
n_beta <- nrow(df_beta)
set.seed(1234)
df_rel <- df_train_encoder %>% 
  filter(label_beta == 0) %>%
  sample_n(n_beta)
set.seed(1234)
df_train_f_sm_eq <- df_rel %>%
  bind_rows(df_beta) %>%
   sample_n(1000)
df_fs_eq <- df_train_f_sm_eq %>%
   select(-engagement) %>%
   select(-content_crashes) %>%
   select(-client_id) %>%
   select(-label_beta) %>%
   select(-label_release) %>%
   select(-is_release) %>%
   select(-app_version)
boruta_results_eq <- list()
for (metric in engagement){
  print(paste('Applying Boruta to ', metric))
  boruta.out <- Boruta(y = df_train_f_sm_eq[[metric]], x=df_fs_eq, doTrace=0)
  boruta_results_eq[[metric]]  <- boruta.out
  # plot(boruta.out, cex.axis=.7, las=2, xlab="", main=metric) 
}
## [1] "Applying Boruta to  active_hours"
## [1] "Applying Boruta to  active_hours_max"
## [1] "Applying Boruta to  uri_count"
## [1] "Applying Boruta to  uri_count_max"
## [1] "Applying Boruta to  search_count"
## [1] "Applying Boruta to  search_count_max"
## [1] "Applying Boruta to  num_pages"
## [1] "Applying Boruta to  num_pages_max"
## [1] "Applying Boruta to  daily_max_tabs"
## [1] "Applying Boruta to  daily_max_tabs_max"
## [1] "Applying Boruta to  daily_unique_domains"
## [1] "Applying Boruta to  daily_unique_domains_max"
## [1] "Applying Boruta to  daily_tabs_opened"
## [1] "Applying Boruta to  daily_tabs_opened_max"
for (metric in names(boruta_results_eq)){
  plot(boruta_results_eq[[metric]], cex.axis=.7, las=2, xlab="", main=metric) 
}

features_top5 <- NULL
for(metric in names(boruta_results_eq)){
  features_top5 <- c(names(sort(apply(boruta_results_eq[[metric]]$ImpHistory, 2, median), decreasing = TRUE)[1:5]), features_top5)
}

x <- data.frame(top5 = sort(c(unique(features_top5))))
kable(x) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F) %>%
scroll_box(width = "100%")
top5
daily_num_sessions_started
daily_num_sessions_started_max
FX_PAGE_LOAD_MS_2_PARENT
memory_mb
num_active_days
num_addons
num_bookmarks
profile_age
profile_age_cat
session_length
session_length_max
TIME_TO_DOM_COMPLETE_MS
TIME_TO_DOM_INTERACTIVE_MS
TIME_TO_LOAD_EVENT_END_MS
TIME_TO_NON_BLANK_PAINT_MS

Increasing to 10:

features_top10 <- NULL
for(metric in names(boruta_results_eq)){
  features_top10 <- c(names(sort(apply(boruta_results_eq[[metric]]$ImpHistory, 2, median), decreasing = TRUE)[1:10]), features_top10)
}

x <- data.frame(top10 = sort(c(unique(features_top10))))
kable(x) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F) %>%
scroll_box(width = "100%")
top10
cpu_speed_mhz
daily_num_sessions_started
daily_num_sessions_started_max
FX_PAGE_LOAD_MS_2_PARENT
memory_mb
num_active_days
num_addons
num_bookmarks
profile_age
profile_age_cat
session_length
session_length_max
startup_ms
startup_ms_max
TIME_TO_DOM_COMPLETE_MS
TIME_TO_DOM_CONTENT_LOADED_END_MS
TIME_TO_DOM_INTERACTIVE_MS
TIME_TO_LOAD_EVENT_END_MS
TIME_TO_NON_BLANK_PAINT_MS
timezone_cat_(4,6]
timezone_offset

Saving

save.image(file = "feature_selection.RData")